1 Load data

### Load required libraries
library(readxl)
library(tidyverse)
library(purrr)
library(dplyr)
library(knitr)
library(DT)
library(httr)
library(jsonlite)
library(stringr)
library(GSEABase)
library(org.Hs.eg.db)

### Load functions
source(file = "../02_functions.R")

### Define paths
current_dir <- getwd()
data_path <- file.path(current_dir,
                       "../../data/_raw")
results_pc_path <- file.path(current_dir,
                              "../../results/patient_characteristics/output")
results_fip_path <- file.path(current_dir,
                              "../../results/somatic_mutation_analysis/functional_impact_prediction/output")
results_gor_path <- file.path(current_dir, 
                                 "../../results/somatic_mutation_analysis/gene_overrepresentation_analysis/output")
figures_gor_path <- file.path(current_dir, 
                                 "../../results/somatic_mutation_analysis/gene_overrepresentation_analysis/figures")

maf_df_path <- file.path(results_fip_path, "all_mutations.csv") # All mutations df
nonsyn_df_path <- file.path(results_fip_path, "nonsyn_mutations.csv") # Nonsyn mutations df
lof_df_path <- file.path(results_fip_path, "lof_mutations.csv") # LoF mutations df

# Sup1 data
sup1_response_path <- file.path(results_pc_path, 
                           "sup1_response.csv")
### Read data & wrangle

# Read 3 lists of mutations

# All
maf_df_annotated <- read.csv(maf_df_path, header = TRUE)[ , -1]
list_of_all_mutations <- split(maf_df_annotated, maf_df_annotated$sample_id
                               )
# Nonsyn
nonsyn_df_annotated <- read.csv(nonsyn_df_path, header = TRUE)[ , -1]
list_of_nonsyn_mutations <- split(nonsyn_df_annotated, nonsyn_df_annotated$sample_id)

# LoF
lof_df_annotated <- read.csv(lof_df_path, header = TRUE)[ , -1]
list_of_lof_mutations <- split(lof_df_annotated, lof_df_annotated$sample_id)

# Get the list of patient IDs
sample_id_list <- unique(maf_df_annotated$sample_id)

## Read sup1_response.csv
sup1_response_df <- read.csv(sup1_response_path)[ , -1] # Drop first column

# df to merge response info later
response_df <- sup1_response_df %>% 
  dplyr::rename(sample_id = Sample.ID,
                patient_response = Patient.Response) %>% 
  dplyr::select(sample_id, patient_response)

# Create a list of Responders and Non-responders
sample_id_lists <- split(sup1_response_df$Sample.ID, 
                         sup1_response_df$Patient.Response) # Split the "Sample.ID" values into lists based on "Patient.Response"

r_sample_ids <- sample_id_lists$R
nr_sample_ids <- sample_id_lists$NR

2 Summary of mutations per patient

# Define output path
mm909_sum_plot_path <- file.path(figures_gor_path,
                              "mm909_sum_plot.png")

# Summary plot
mm909_sum_plot <- sum_plot(maf_df_annotated, nonsyn_df_annotated, lof_df_annotated, mm909_sum_plot_path)

# Display
print(mm909_sum_plot)

3 Pathway analysis (gene overrepresentation)

3.1 First analysis using C5 - BP

C5 consists of gene sets derived from the Gene Ontology (GO) annotations. Here, we focus on Gene sets derived from the GO Biological Process (BP) ontology.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_c5bp_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_c5bp_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_c5bp_lof.csv")

# # Perform FORA analysis for C5:BP Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C5", subcategory="BP")

# Load fora results
fora_results_c5bp_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c5bp_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c5bp_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.1.1 Visualization: Top C5 BP across samples stratified by response (All mutations)

# Define output path
mm909_c5bp_all_plot_path <- file.path(figures_gor_path,
                              "mm909_c5bp_all_plot.png")

mm909_c5bp_all_plot <- plot_top5(response_df, fora_results_c5bp_all_df, mm909_c5bp_all_plot_path, "FORA with top C5 BP across samples (All mutations)", "Top C5 BP Gene Sets")

print(mm909_c5bp_all_plot)

3.1.2 Visualization: Top C5 BP across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_c5bp_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_c5bp_nonsyn_plot.png")

mm909_c5bp_nonsyn_plot <- plot_top5(response_df, fora_results_c5bp_nonsyn_df, mm909_c5bp_nonsyn_plot_path, "FORA with top C5 BP across samples (Nonsyn mutations)", "Top C5 BP Gene Sets")

print(mm909_c5bp_nonsyn_plot)

3.1.3 Visualization: Top C5 BP across samples stratified by response (LoF mutations)

# Define output path
mm909_c5bp_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_c5bp_lof_plot.png")

mm909_c5bp_lof_plot <- plot_top5(response_df, fora_results_c5bp_lof_df, mm909_c5bp_lof_plot_path, "FORA with top C5 BP across samples (LoF mutations)", "Top C5 BP Gene Sets")

print(mm909_c5bp_lof_plot)

3.2 Second analysis using C5 - CC

C5 consists of gene sets derived from the Gene Ontology (GO) annotations. Here, we focus on Gene sets derived from the GO Cellular Component (CC) ontology.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_c5cc_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_c5cc_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_c5cc_lof.csv")

# # Perform FORA analysis for H Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C5", subcategory="CC")

# Load fora results
fora_results_c5cc_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c5cc_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c5cc_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.2.1 Visualization: C5 CC PROTEASOME across samples stratified by response (LoF mutations)

# Define output path
mm909_c5cc_proteasome_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_c5cc_proteasome_lof_plot.png")

proteasome_genesets = c("GOCC_PROTEASOME_ACCESSORY_COMPLEX", "GOCC_PROTEASOME_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX_ALPHA_SUBUNIT_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX_BETA_SUBUNIT_COMPLEX", "GOCC_PROTEASOME_REGULATORY_PARTICLE", "GOCC_PROTEASOME_REGULATORY_PARTICLE_BASE_SUBCOMPLEX", "GOCC_PROTEASOME_REGULATORY_PARTICLE_LID_SUBCOMPLEX")

mm909_c5cc_proteasome_lof_plot <- plot_geneset(proteasome_genesets, response_df, fora_results_c5cc_lof_df, mm909_c5cc_proteasome_lof_plot_path, "FORA with PROTEASOME-related C5 CC Gene Sets across samples (LoF mutations)", "PROTEASOME Gene Sets")

print(mm909_c5cc_proteasome_lof_plot)

3.3 Third analysis using H

Hallmark gene sets summarize and represent specific well-defined biological states or processes and display coherent expression.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_h_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_h_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_h_lof.csv")

# # Perform FORA analysis for H Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="H")

# Load fora results
fora_results_h_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_h_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_h_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.3.1 Visualization: Top H across samples stratified by response (All mutations)

# Define output path
mm909_h_all_plot_path <- file.path(figures_gor_path,
                              "mm909_h_all_plot.png")

mm909_h_all_plot <- plot_top5(response_df, fora_results_h_all_df, mm909_h_all_plot_path, "FORA with top H across samples (All mutations)", "Top H Gene Sets")

print(mm909_h_all_plot)

3.3.2 Visualization: Top H across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_h_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_h_nonsyn_plot.png")

mm909_h_nonsyn_plot <- plot_top5(response_df, fora_results_h_nonsyn_df, mm909_h_nonsyn_plot_path, "FORA with top H across samples (Nonsyn mutations)", "Top H Gene Sets")

print(mm909_h_nonsyn_plot)

3.3.3 Visualization: Top H across samples stratified by response (LoF mutations)

# Define output path
mm909_h_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_h_lof_plot.png")

mm909_h_lof_plot <- plot_top5(response_df, fora_results_h_lof_df, mm909_h_lof_plot_path, "FORA with top H across samples (LoF mutations)", "Top H Gene Sets")

print(mm909_h_lof_plot)

3.3.4 Visualization: HALLMARK_APOPTOSIS across samples stratified by response (LoF)

Filter specifically for HALLMARK_APOPTOSIS gene set.

# Define output path
mm909_h_apoptosis_plot_path <- file.path(figures_gor_path,
                              "mm909_h_apoptosis_plot.png")

mm909_h_apoptosis_plot <- plot_geneset(c("HALLMARK_APOPTOSIS"), response_df, fora_results_h_lof_df, mm909_h_apoptosis_plot_path, "FORA with HALLMARK_APOPTOSIS across samples (LoF mutations)", "HALLMARK_APOPTOSIS Gene Set")

print(mm909_h_apoptosis_plot)

3.4 Fourth analysis using C6 (oncogenic)

Gene sets that represent signatures of cellular pathways which are often dis-regulated in cancer. The majority of signatures were generated directly from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation of known cancer genes.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_c6_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_c6_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_c6_lof.csv")

# # Perform FORA analysis for C6 Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C6")

# Load fora results
fora_results_c6_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c6_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c6_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.4.1 Visualization: Top C6 across samples stratified by response (All mutations)

# Define output path
mm909_c6_all_plot_path <- file.path(figures_gor_path,
                              "mm909_c6_all_plot.png")

mm909_c6_all_plot <- plot_top5(response_df, fora_results_c6_all_df, mm909_c6_all_plot_path, "FORA with top C6 across samples (All mutations)", "Top C6 Gene Sets")

print(mm909_c6_all_plot)

3.4.2 Visualization: Top C6 across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_c6_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_c6_nonsyn_plot.png")

mm909_c6_nonsyn_plot <- plot_top5(response_df, fora_results_c6_nonsyn_df, mm909_c6_nonsyn_plot_path, "FORA with top C6 across samples (Nonsyn mutations)", "Top C6 Gene Sets")

print(mm909_c6_nonsyn_plot)

3.4.3 Visualization: Top C6 across samples stratified by response (LoF mutations)

# Define output path
mm909_c6_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_c6_lof_plot.png")

mm909_c6_lof_plot <- plot_top5(response_df, fora_results_c6_lof_df, mm909_c6_lof_plot_path, "FORA with top C6 across samples (LoF mutations)", "Top C6 Gene Sets")

print(mm909_c6_lof_plot)

3.5 Fifth analysis using C7 (immunologic)

Gene sets that represent cell states and perturbations within the immune system.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_c7_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_c7_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_c7_lof.csv")

# # Perform FORA analysis for C7 Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C7")

# Load fora results
fora_results_c7_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c7_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c7_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.5.1 Visualization: Top C7 across samples stratified by response (All mutations)

# Define output path
mm909_c7_all_plot_path <- file.path(figures_gor_path,
                              "mm909_c7_all_plot.png")

mm909_c7_all_plot <- plot_top5(response_df, fora_results_c7_all_df, mm909_c7_all_plot_path, "FORA with top C7 across samples (All mutations)", "Top C7 Gene Sets")

print(mm909_c7_all_plot)

3.5.2 Visualization: Top C7 across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_c7_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_c7_nonsyn_plot.png")

mm909_c7_nonsyn_plot <- plot_top5(response_df, fora_results_c7_nonsyn_df, mm909_c7_nonsyn_plot_path, "FORA with top C7 across samples (Nonsyn mutations)", "Top C7 Gene Sets")

print(mm909_c7_nonsyn_plot)

3.5.3 Visualization: Top C7 across samples stratified by response (LoF mutations)

# Define output path
mm909_c7_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_c7_lof_plot.png")

mm909_c7_lof_plot <- plot_top5(response_df, fora_results_c7_lof_df, mm909_c7_lof_plot_path, "FORA with top C7 across samples (LoF mutations)", "Top C7 Gene Sets")

print(mm909_c7_lof_plot)

3.6 Sixth analysis using MSigDB filter of “Ubiquitin”

I searched for the term “ubiquitin” in MSigDB and downloaded a GMT file with all the hits (Gene Sets). Number of GS: 101.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_ubiquitin_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_ubiquitin_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_ubiquitin_lof.csv")

# # Load GMT file as a GeneSetCollection object
# gene_set_collection <- getGmt(file.path(data_path, "genesets_ubiquitin.v2023.2.Hs.gmt"))
# 
# # Initialize an empty list to store the results
# gene_sets_list <- list()
# 
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_set_collection)) {
#   # Extract the GeneSet by its name
#   geneSet <- gene_set_collection[[setName]]
# 
#   # Convert Hugo symbols to Ensembl IDs for this GeneSet
#   ensembl_ids <- SYMBOLtoENSEMBL(geneIds(geneSet))
# 
#   # Ensure the result is a character vector of Ensembl Gene IDs
#   ensembl_ids_vector <- as.character(ensembl_ids)
# 
#   # Store the converted IDs in the list, associated with the setName
#   gene_sets_list[[setName]] <- ensembl_ids_vector
# }
# 
# # Perform FORA analysis for Ubiquitin Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)

# Load fora results
fora_results_ubiquitin_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_ubiquitin_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_ubiquitin_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.6.1 Visualization: Top Ubiquitin GS across samples stratified by response (All mutations)

# Define output path
mm909_ubiquitin_all_plot_path <- file.path(figures_gor_path,
                              "mm909_ubiquitin_all_plot.png")

mm909_ubiquitin_all_plot <- plot_top5(response_df, fora_results_ubiquitin_all_df, mm909_ubiquitin_all_plot_path, "FORA with top Ubiquitin GS across samples (All mutations)", "Top Ubiquitin Gene Sets")

print(mm909_ubiquitin_all_plot)

3.6.2 Visualization: Top Ubiquitin GS across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_ubiquitin_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_ubiquitin_nonsyn_plot.png")

mm909_ubiquitin_nonsyn_plot <- plot_top5(response_df, fora_results_ubiquitin_nonsyn_df, mm909_ubiquitin_nonsyn_plot_path, "FORA with top Ubiquitin GS across samples (Nonsyn mutations)", "Top Ubiquitin Gene Sets")

print(mm909_ubiquitin_nonsyn_plot)

3.6.3 Visualization: Top Ubiquitin GS across samples stratified by response (LoF mutations)

# Define output path
mm909_ubiquitin_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_ubiquitin_lof_plot.png")

mm909_ubiquitin_lof_plot <- plot_top5(response_df, fora_results_ubiquitin_lof_df, mm909_ubiquitin_lof_plot_path, "FORA with top Ubiquitin GS across samples (LoF mutations)", "Top Ubiquitin Gene Sets")

print(mm909_ubiquitin_lof_plot)

3.7 Seventh analysis using C2:CGP (chemical & genetic perturbations)

Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Chemical and genetic perturbations (CGP).

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_c2cgp_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_c2cgp_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_c2cgp_lof.csv")

# # Perform FORA analysis for C2_CGP Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C2", subcategory="CGP")

# Load fora results
fora_results_c2cgp_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c2cgp_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c2cgp_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.7.1 Visualization: Top C2:CGP across samples stratified by response (All mutations)

# Define output path
mm909_c2cgp_all_plot_path <- file.path(figures_gor_path,
                              "mm909_c2cgp_all_plot.png")

mm909_c2cgp_all_plot <- plot_top5(response_df, fora_results_c2cgp_all_df, mm909_c2cgp_all_plot_path, "FORA with top C2:CGP across samples (All mutations)", "Top C2:CGP Gene Sets")

print(mm909_c2cgp_all_plot)

3.7.2 Visualization: Top C2:CGP across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_c2cgp_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_c2cgp_nonsyn_plot.png")

mm909_c2cgp_nonsyn_plot <- plot_top5(response_df, fora_results_c2cgp_nonsyn_df, mm909_c2cgp_nonsyn_plot_path, "FORA with top C2:CGP across samples (Nonsyn mutations)", "Top C2:CGP Gene Sets")

print(mm909_c2cgp_nonsyn_plot)

3.7.3 Visualization: Top C2:CGP across samples stratified by response (LoF mutations)

# Define output path
mm909_c2cgp_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_c2cgp_lof_plot.png")

mm909_c2cgp_lof_plot <- plot_top5(response_df, fora_results_c2cgp_lof_df, mm909_c2cgp_lof_plot_path, "FORA with top C2:CGP across samples (LoF mutations)", "Top C2:CGP Gene Sets")

print(mm909_c2cgp_lof_plot)

3.8 Eighth analysis using C2:CP:KEGG_MEDICUS

Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Canonical pathways (CP), specifically Canonical Pathways gene sets derived from the KEGG MEDICUS pathway database.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_keggmed_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_keggmed_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_keggmed_lof.csv")

# # Load GMT file as a GeneSetCollection object
# gene_set_collection <- getGmt(file.path(data_path, "c2.cp.kegg_medicus.v2023.2.Hs.entrez.gmt"))
# 
# # Initialize an empty list to store the results
# gene_sets_list <- list()
# 
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_set_collection)) {
#   # Extract the GeneSet by its name
#   geneSet <- gene_set_collection[[setName]]
# 
#   # Convert Entrez IDs to Ensembl IDs for this GeneSet
#   ensembl_ids <- ENTREZtoENSEMBL(geneIds(geneSet))
# 
#   # Ensure the result is a character vector of Ensembl Gene IDs
#   ensembl_ids_vector <- as.character(ensembl_ids)
# 
#   # Store the converted IDs in the list, associated with the setName
#   gene_sets_list[[setName]] <- ensembl_ids_vector
# }
# 
# # Perform FORA analysis for KEGG_MEDICUS Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)

# Load fora results
fora_results_keggmed_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_keggmed_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_keggmed_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.8.1 Visualization: Top C2:CP:KEGG_MEDICUS across samples stratified by response (All mutations)

# Define output path
mm909_keggmed_all_plot_path <- file.path(figures_gor_path,
                              "mm909_keggmed_all_plot.png")

mm909_keggmed_all_plot <- plot_top5(response_df, fora_results_keggmed_all_df, mm909_keggmed_all_plot_path, "FORA with top C2:CP:KEGG_MEDICUS across samples (All mutations)", "Top C2:CP:KEGG_MEDICUS Gene Sets")

print(mm909_keggmed_all_plot)

3.8.2 Visualization: Top C2:CP:KEGG_MEDICUS across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_keggmed_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_keggmed_nonsyn_plot.png")

mm909_keggmed_nonsyn_plot <- plot_top5(response_df, fora_results_keggmed_nonsyn_df, mm909_keggmed_nonsyn_plot_path, "FORA with top C2:CP:KEGG_MEDICUS across samples (Nonsyn mutations)", "Top C2:CP:KEGG_MEDICUS Gene Sets")

print(mm909_keggmed_nonsyn_plot)

3.8.3 Visualization: Top C2:CP:KEGG_MEDICUS across samples stratified by response (LoF mutations)

# Define output path
mm909_keggmed_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_keggmed_lof_plot.png")

mm909_keggmed_lof_plot <- plot_top5(response_df, fora_results_keggmed_lof_df, mm909_keggmed_lof_plot_path, "FORA with top C2:CP:KEGG_MEDICUS across samples (LoF mutations)", "Top C2:CP:KEGG_MEDICUS Gene Sets")

print(mm909_keggmed_lof_plot)

3.9 Ninth analysis using C2:CP:REACTOME

Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Canonical pathways (CP), specifically Canonical Pathways gene sets derived from the Reactome pathway database.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_reactome_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_reactome_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_reactome_lof.csv")

# # Perform FORA analysis for C2_CGP Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C2", subcategory="CP:REACTOME")

# Load fora results
fora_results_reactome_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_reactome_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_reactome_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.9.1 Visualization: Top C2:CP:REACTOME across samples stratified by response (All mutations)

# Define output path
mm909_reactome_all_plot_path <- file.path(figures_gor_path,
                              "mm909_reactome_all_plot.png")

mm909_reactome_all_plot <- plot_top5(response_df, fora_results_reactome_all_df, mm909_reactome_all_plot_path, "FORA with top C2:CP:REACTOME across samples (All mutations)", "Top C2:CP:REACTOME Gene Sets")

print(mm909_reactome_all_plot)

3.9.2 Visualization: Top C2:CP:REACTOME across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_reactome_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_reactome_nonsyn_plot.png")

mm909_reactome_nonsyn_plot <- plot_top5(response_df, fora_results_reactome_nonsyn_df, mm909_reactome_nonsyn_plot_path, "FORA with top C2:CP:REACTOME across samples (Nonsyn mutations)", "Top C2:CP:REACTOME Gene Sets")

print(mm909_reactome_nonsyn_plot)

3.9.3 Visualization: Top C2:CP:REACTOME across samples stratified by response (LoF mutations)

# Define output path
mm909_reactome_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_reactome_lof_plot.png")

mm909_reactome_lof_plot <- plot_top5(response_df, fora_results_reactome_lof_df, mm909_reactome_lof_plot_path, "FORA with top C2:CP:REACTOME across samples (LoF mutations)", "Top C2:CP:REACTOME Gene Sets")

print(mm909_reactome_lof_plot)

3.10 Tenth analysis using MSigDB filter of “Proteasome”

I searched for the term “proteasome” in MSigDB and downloaded a GMT file with all the hits (Gene Sets). Number of GS: 56.

# Define paths
fora_results_all_csv_path <- file.path(results_gor_path, "fora_results_proteasome_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_gor_path, "fora_results_proteasome_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_gor_path, "fora_results_proteasome_lof.csv")

# # Load GMT file as a GeneSetCollection object
# gene_set_collection <- getGmt(file.path(data_path, "genesets_proteasome.v2023.2.Hs.gmt"))
# 
# # Initialize an empty list to store the results
# gene_sets_list <- list()
# 
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_set_collection)) {
#   # Extract the GeneSet by its name
#   geneSet <- gene_set_collection[[setName]]
# 
#   # Convert Hugo Symbols to Ensembl IDs for this GeneSet
#   ensembl_ids <- SYMBOLtoENSEMBL(geneIds(geneSet))
# 
#   # Ensure the result is a character vector of Ensembl Gene IDs
#   ensembl_ids_vector <- as.character(ensembl_ids)
# 
#   # Store the converted IDs in the list, associated with the setName
#   gene_sets_list[[setName]] <- ensembl_ids_vector
# }
# 
# # Perform FORA analysis for PROTEASOME Gene Sets
# results_list <- perform_fora_analysis(list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)

# Load fora results
fora_results_proteasome_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_proteasome_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_proteasome_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

3.10.1 Visualization: Top PROTEASOME GS across samples stratified by response (All mutations)

# Define output path
mm909_proteasome_all_plot_path <- file.path(figures_gor_path,
                              "mm909_proteasome_all_plot.png")

mm909_proteasome_all_plot <- plot_top5(response_df, fora_results_proteasome_all_df, mm909_proteasome_all_plot_path, "FORA with top PROTEASOME GS across samples (All mutations)", "Top PROTEASOME Gene Sets")

print(mm909_proteasome_all_plot)

3.10.2 Visualization: Top PROTEASOME GS across samples stratified by response (Nonsyn mutations)

# Define output path
mm909_proteasome_nonsyn_plot_path <- file.path(figures_gor_path,
                              "mm909_proteasome_nonsyn_plot.png")

mm909_proteasome_nonsyn_plot <- plot_top5(response_df, fora_results_proteasome_nonsyn_df, mm909_proteasome_nonsyn_plot_path, "FORA with top PROTEASOME GS across samples (Nonsyn mutations)", "Top PROTEASOME Gene Sets")

print(mm909_proteasome_nonsyn_plot)

3.10.3 Visualization: Top PROTEASOME GS across samples stratified by response (LoF mutations)

# Define output path
mm909_proteasome_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_proteasome_lof_plot.png")

mm909_proteasome_lof_plot <- plot_top5(response_df, fora_results_proteasome_lof_df, mm909_proteasome_lof_plot_path, "FORA with top PROTEASOME GS across samples (LoF mutations)", "Top PROTEASOME Gene Sets")

print(mm909_proteasome_lof_plot)

3.10.4 Visualization: KEGG_PROTEASOME GS across samples stratified by response (LoF mutations)

Filter specifically for KEGG_PROTEASOME gene set.

# Define output path
mm909_keggproteasome_lof_plot_path <- file.path(figures_gor_path,
                              "mm909_proteasome_lof_plot.png")

geneset = c("KEGG_PROTEASOME")

mm909_keggproteasome_lof_plot <- plot_geneset(geneset, response_df, fora_results_proteasome_lof_df, mm909_keggproteasome_lof_plot_path, "FORA with KEGG_PROTEASOME GS across samples (LoF mutations)", "KEGG_PROTEASOME Gene Set")

print(mm909_keggproteasome_lof_plot)

4 Session Info

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.18.0  GSEABase_1.64.0      graph_1.80.0        
##  [4] annotate_1.80.0      XML_3.99-0.16.1      AnnotationDbi_1.64.1
##  [7] IRanges_2.36.0       S4Vectors_0.40.2     Biobase_2.62.0      
## [10] BiocGenerics_0.48.1  jsonlite_1.8.8       httr_1.4.7          
## [13] DT_0.31              knitr_1.45           lubridate_1.9.3     
## [16] forcats_1.0.0        stringr_1.5.1        dplyr_1.1.4         
## [19] purrr_1.0.2          readr_2.1.5          tidyr_1.3.1         
## [22] tibble_3.2.1         ggplot2_3.4.4        tidyverse_2.0.0     
## [25] readxl_1.4.3        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0        farver_2.1.1            blob_1.2.4             
##  [4] Biostrings_2.70.2       bitops_1.0-7            fastmap_1.1.1          
##  [7] RCurl_1.98-1.14         digest_0.6.34           timechange_0.3.0       
## [10] lifecycle_1.0.4         KEGGREST_1.42.0         RSQLite_2.3.5          
## [13] magrittr_2.0.3          compiler_4.3.2          rlang_1.1.3            
## [16] sass_0.4.8              tools_4.3.2             utf8_1.2.4             
## [19] yaml_2.3.8              labeling_0.4.3          htmlwidgets_1.6.4      
## [22] bit_4.0.5               withr_3.0.0             grid_4.3.2             
## [25] fansi_1.0.6             xtable_1.8-4            colorspace_2.1-0       
## [28] scales_1.3.0            cli_3.6.2               rmarkdown_2.25         
## [31] crayon_1.5.2            ragg_1.2.7              generics_0.1.3         
## [34] rstudioapi_0.15.0       tzdb_0.4.0              DBI_1.2.1              
## [37] cachem_1.0.8            zlibbioc_1.48.0         cellranger_1.1.0       
## [40] XVector_0.42.0          vctrs_0.6.5             hms_1.1.3              
## [43] bit64_4.0.5             systemfonts_1.0.5       jquerylib_0.1.4        
## [46] glue_1.7.0              stringi_1.8.3           gtable_0.3.4           
## [49] GenomeInfoDb_1.38.6     munsell_0.5.0           pillar_1.9.0           
## [52] htmltools_0.5.7         GenomeInfoDbData_1.2.11 R6_2.5.1               
## [55] textshaping_0.3.7       evaluate_0.23           highr_0.10             
## [58] png_0.1-8               memoise_2.0.1           bslib_0.6.1            
## [61] xfun_0.42               pkgconfig_2.0.3